import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
# Load the raw datasets from the current working directory.
# os.path.join keeps the paths portable; the original hard-coded '\\'
# separator only worked on Windows.
total = pd.read_csv(os.path.join(os.getcwd(), 'total.csv'))
TTRAFFICREMOVE_RF = pd.read_csv(os.path.join(os.getcwd(), 'TTRAFFICREMOVE_RF.csv'))
pd.set_option('display.max_rows', 110)
total.head()
TTRAFFICREMOVE_RF.columns.values
# Combine Corridor and Direction into a single categorical column.
# The fifteen indicator columns are assumed to be one-hot per row
# (TODO confirm), so a dot product with the weights [1..15] recovers
# the index of the active corridor for every row at once.
indicator_cols = ['avppurple', 'avpblack', 'phl1orange',
                  'phl1red', 'phl1blue', 'phl1purple', 'phl2orange', 'phl2blue',
                  'phl2green', 'pit1orange', 'pit1red', 'pit1blue', 'pit1purple',
                  'pit2red', 'pit2blue']
# Map each column name to its 1-based code, mirroring the weight vector.
code_of = {name: i + 1 for i, name in enumerate(indicator_cols)}
weights = np.arange(1, 16).reshape(-1, 1)   # column vector [1..15]
codes = total[indicator_cols].values.dot(weights)[:, 0]
# Translate each numeric code back to its corridor label; rows whose
# code is outside 1..15 (e.g. all-zero rows) are left as None.
corridorname = np.empty(len(codes), dtype=object)
for name in indicator_cols:
    corridorname[np.where(codes == code_of[name])] = name
del code_of, weights, codes
TTRAFFICREMOVE_RF['corridorname'] = pd.Series(corridorname)
TTRAFFICREMOVE_RF.head()
# One-hot encode the categorical columns with get_dummies, then swap
# the dummy frames in for the original columns (plus 'direction' and
# 'location', which are dropped without replacement).
dummies = {col: pd.get_dummies(TTRAFFICREMOVE_RF[col])
           for col in ('month', 'hour', 'speedlimit', 'ampm', 'corridorname')}
dropped = TTRAFFICREMOVE_RF.drop(
    ['month', 'hour', 'direction', 'location', 'speedlimit', 'ampm', 'corridorname'],
    axis=1)
TTRAFFICREMOVE_RF = pd.concat(
    [dummies['month'], dummies['hour'], dummies['speedlimit'],
     dummies['ampm'], dummies['corridorname'], dropped],
    axis=1)
TTRAFFICREMOVE_RF.head()
TTRAFFICREMOVE_RF.columns.values
# Rename every column positionally.  NOTE(review): this assumes the
# concat above produced exactly these 35 columns in exactly this order
# (month dummies, hour dummies, speed-limit dummies, AM/PM, corridor
# dummies, then the remaining numeric columns) -- verify against the data.
feature_names = [
    'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct',
    '7AM', '8AM', '16PM', '17PM',
    '40mph', '45mph', '50mph', '55mph', '65mph',
    'AM', 'PM',
    'avpblack', 'avppurple', 'phl1blue', 'phl1orange', 'phl1purple',
    'phl1red', 'phl2blue', 'phl2green', 'phl2orange', 'pit1blue',
    'pit1orange', 'pit1purple', 'pit1red', 'pit2blue', 'pit2red',
    'precipitation', 'visibility', 'speed',
]
TTRAFFICREMOVE_RF.columns = feature_names
print(TTRAFFICREMOVE_RF.columns.values)
TTRAFFICREMOVE_RF.shape
# Persist the renamed table (tab-separated despite the .csv suffix).
TTRAFFICREMOVE_RF.to_csv('TTRAFFICREMOVE_RF_R.csv', sep='\t', encoding='utf-8')
# Separate the target variable (speed) from the feature matrix.
speed = np.array(TTRAFFICREMOVE_RF['speed'])
TTRAFFICREMOVE_RF = TTRAFFICREMOVE_RF.drop('speed', axis=1)
feature_list = list(TTRAFFICREMOVE_RF.columns)   # 34 post-drop feature names
TTRAFFICREMOVE_RF = np.array(TTRAFFICREMOVE_RF)
# Hold out 25% of the rows for testing; fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(
    TTRAFFICREMOVE_RF, speed, test_size=0.25, random_state=42)
for caption, array in (('training feature shape: ', train_features),
                       ('testing feature shape: ', test_features),
                       ('training labels shape: ', train_labels),
                       ('testing labels shape: ', test_labels)):
    print(caption, array.shape)
from sklearn.ensemble import RandomForestRegressor
# Fit a random forest regressor: 300 trees, fixed seed so the fit is
# reproducible.
forest_params = dict(n_estimators=300, random_state=42)
rf = RandomForestRegressor(**forest_params)
rf.fit(train_features, train_labels)
# Predict speeds for the held-out test rows.
predictions = rf.predict(test_features)
# Evaluate the random-forest predictions on the held-out test set.
from math import sqrt
import sklearn.metrics
from sklearn.metrics import mean_squared_error
# Per-sample absolute errors; their mean is the MAE printed below.
# (Renamed from `mae`, which misleadingly suggested it already held the
# mean.)
abs_errors = abs(predictions - test_labels)
# Compute the MSE directly, then take its square root for the RMSE.
# The original computed RMSE first and squared it back into the MSE,
# which just round-trips the value through sqrt for no benefit.
mse = mean_squared_error(test_labels, predictions)
rmse = sqrt(mse)
Rsquare = sklearn.metrics.r2_score(test_labels, predictions, multioutput='uniform_average')
# NOTE(review): 'degrees.' looks like a leftover unit label -- the
# target here is speed; confirm and fix the wording if so.
print('Mean Absolute Error:', round(np.mean(abs_errors), 2), 'degrees.')
print('Rooted Mean Square Error: ', rmse)
print('Mean Square Error: ', mse)
print('R square: ', Rsquare)
# Mean absolute percentage error.  NOTE(review): this divides by the raw
# labels, so any zero-speed observation would produce inf -- confirm the
# speeds are strictly positive.
mape = 100 * (abs_errors / test_labels)
# "Accuracy" here is simply 100% minus the mean percentage error.
accuracy = 100 - np.mean(mape)
print('accuracy: ', round(accuracy, 2), '%.')
# Import tools needed to render a single decision tree.
from sklearn.tree import export_graphviz
from io import StringIO  # sklearn.externals.six was removed in scikit-learn 0.23
import pydot
from IPython.display import Image
import pydotplus
# Pull out one tree from the forest (index 5 is arbitrary).
tree = rf.estimators_[5]
# Export the tree to a dot file for reference.
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list,
rounded = True, precision = 1)
# Render the tree to a PNG via an in-memory dot buffer.  (The original
# also parsed tree.dot with pydot into a graph object that was
# immediately overwritten before use -- that dead step is dropped.)
dot_data = StringIO()
export_graphviz(tree, out_file=dot_data,
filled=True, rounded=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('tree.png')
Image(graph.create_png())
import matplotlib.image as mpimg
from PIL import *  # NOTE(review): nothing from PIL appears to be used below
# Display the rendered tree PNG inline with matplotlib, axes hidden.
tree_png = mpimg.imread('tree.png')
imgplot = plt.imshow(tree_png)
plt.axis('off')
plt.show()
# Re-fit a shallow, 10-tree forest purely to get a readable tree figure.
# random_state is pinned (it was not in the original, making the figure
# irreproducible) for consistency with the other models in this file.
rf_small = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)
rf_small.fit(train_features, train_labels)
# Extract one of its trees (index 5, as before).
tree_small = rf_small.estimators_[5]
# Export it to dot, render to PNG, and display inline.
export_graphviz(tree_small, out_file='small_tree.dot',
                feature_names=feature_list, rounded=True, precision=1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png');
plt.figure(figsize=(12, 8))
small_tree_png = mpimg.imread('small_tree.png')
imgplot = plt.imshow(small_tree_png)
plt.axis('off')
plt.show()
# To quantify each feature's contribution to the random forest model, we
# examine the relative feature importances reported by sklearn's
# RandomForestRegressor: they estimate how much each feature improves
# the predictions.  (This was a bare markdown cell pasted into the
# script; as a comment it no longer breaks the Python syntax.)
# Get numerical feature importances from the fitted random forest.
importances = list(rf.feature_importances_)
# Pair each feature name with its importance, rounded for display.
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the pairs by importance, most important first.
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Plain loop instead of the original side-effect-only list comprehension
# (a comprehension built and discarded a list of None just to print).
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))
# The ranking above shows which features matter most for predicting
# speed under congested conditions:
# Horizontal bar chart of the forest's feature importances, largest at
# the top.  Sort the names and the *raw* importance values together: the
# original sorted the labels by their rounded scores but the bar lengths
# by the raw scores, so ties among rounded values could misalign a label
# with its bar.
ranked = sorted(zip(feature_list, importances), key=lambda p: p[1], reverse=True)
feature = [name for name, _ in ranked]
ranked_values = [value for _, value in ranked]
plt.rcdefaults()
fig, ax = plt.subplots()
ax.barh(np.arange(len(ranked)), ranked_values, align='center')
ax.set_yticks(np.arange(len(ranked)))
ax.set_yticklabels(feature)
ax.invert_yaxis()  # most important feature at the top
ax.set_xlabel('Feature Importance')
ax.set_title('Feature Importance Plot')
# Gradient Boosting
import glob
import time
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
# NOTE(review): sklearn.ensemble.partial_dependence was deprecated in
# scikit-learn 0.22 and removed in 0.24; on modern versions these live
# in sklearn.inspection (with a different call signature).
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import FormatStrFormatter
start_time = time.time()
# 1500 shallow trees with a small learning rate, least-squares loss.
params = {'n_estimators': 1500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)
clf.fit(train_features, train_labels)
# Report test-set MSE and the wall-clock time spent fitting.
mse = mean_squared_error(test_labels, clf.predict(test_features))
print('MSE: %.4f' % mse)
print("--- %s seconds ---" % (time.time() - start_time))
# Record the test-set deviance at every boosting stage, then plot it
# against the training deviance tracked by the model itself.
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
for stage, stage_pred in enumerate(clf.staged_predict(test_features)):
    test_score[stage] = clf.loss_(test_labels, stage_pred)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
boosting_iters = np.arange(params['n_estimators']) + 1
plt.plot(boosting_iters, clf.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(boosting_iters, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
# Plot the boosted model's feature importances.
feature_importance = clf.feature_importances_
# Make importances relative to the maximum importance.
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
# Label the bars with the post-drop feature names, permuted by the same
# sort order as the bars.  The original passed the unsorted
# `feature_names` list, which (a) still contains 'speed' and is one
# entry longer than the importance array, and (b) was never reordered by
# sorted_idx, so every bar was mislabeled.
plt.yticks(pos, np.array(feature_list)[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
# 1-D and 2-D partial dependence of speed on precipitation (index 32)
# and visibility (index 33) of the post-drop feature matrix.
target_feature = [32, 33, (32,33)]
fig2, axs2 = plot_partial_dependence(clf, train_features, target_feature,
                                     feature_names=feature_names,
                                     n_jobs=3, grid_resolution=50)
# Title the partial-dependence figure itself.  The original called
# fig.suptitle, which titled the earlier feature-importance figure
# (`fig` was last bound by that plot) and left this one untitled.
fig2.suptitle('Partial Dependence of Speed on Precipitation and Visibility')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle
# 3-D surface of the joint partial dependence on precipitation and
# visibility.
fig = plt.figure()
target_feature = (32,33)  # 32, 33 is precipitation and visibility
pdp, axes = partial_dependence(clf, target_feature,
                               X=train_features, grid_resolution=50)
grid_x, grid_y = np.meshgrid(axes[0], axes[1])
# Reshape the flat partial-dependence values onto the 2-D grid.
surface_z = pdp[0].reshape(list(map(np.size, axes))).T
ax = Axes3D(fig)
surf = ax.plot_surface(grid_x, grid_y, surface_z, rstride=1, cstride=1,
                       cmap=plt.cm.BuPu, edgecolor='k')
ax.set_xlabel(feature_names[target_feature[0]])
ax.set_ylabel(feature_names[target_feature[1]])
ax.set_zlabel('Partial dependence')
# Pretty initial camera angle.
ax.view_init(elev=40, azim=75)
plt.colorbar(surf)
plt.suptitle('Partial Dependence of Speed on Precipitation and Visibility')
plt.subplots_adjust(top=0.9)
plt.show()